{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 6\n",
    "Deep learning for text and sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vectorization types:\n",
    " # a, one hot encoding \n",
    " # b, embedding : 1. training karai khud or\n",
    " #                2. pretrained embading use kersakte hai (Glove)\n",
    " # c, ngrams approch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading the IMDB data for use with an Embedding layer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_features = 10000   # most common word\n",
    "\n",
    "from tensorflow.keras.datasets import imdb\n",
    "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(25000,)\n",
      "(25000,)\n",
      "(25000,)\n",
      "(25000,)\n"
     ]
    }
   ],
   "source": [
    "print(x_train.shape)\n",
    "print(y_train.shape)\n",
    "print(x_test.shape)\n",
    "print(y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pad_sequences is used to ensure that all sequences in a list have the same length.\n",
    "\n",
    "maxlen = 20            # 20 word ka sentence hoga max\n",
    "\n",
    "from tensorflow.keras import preprocessing\n",
    "x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)\n",
    "x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(25000, 20)\n",
      "(25000,)\n",
      "(25000, 20)\n",
      "(25000,)\n"
     ]
    }
   ],
   "source": [
    "print(x_train.shape)\n",
    "print(y_train.shape)\n",
    "print(x_test.shape)\n",
    "print(y_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using an Embedding layer and classifier on the IMDB data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"sequential\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding (Embedding)        (None, 20, 8)             80000     \n",
      "_________________________________________________________________\n",
      "flatten (Flatten)            (None, 160)               0         \n",
      "_________________________________________________________________\n",
      "dense (Dense)                (None, 1)                 161       \n",
      "=================================================================\n",
      "Total params: 80,161\n",
      "Trainable params: 80,161\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Flatten, Dense\n",
    "from tensorflow.keras.layers import Embedding\n",
    "\n",
    "model = Sequential()\n",
    "model.add(Embedding(10000, 8, input_length=maxlen)) # max_features = 10000 sentences, 8 ? # maxlen = 20 max word\n",
    "model.add(Flatten())\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 20000 samples, validate on 5000 samples\n",
      "Epoch 1/10\n",
      "20000/20000 [==============================] - 4s 218us/sample - loss: 0.6728 - acc: 0.6108 - val_loss: 0.6232 - val_acc: 0.6954\n",
      "Epoch 2/10\n",
      "20000/20000 [==============================] - 4s 179us/sample - loss: 0.5440 - acc: 0.7498 - val_loss: 0.5255 - val_acc: 0.7288\n",
      "Epoch 3/10\n",
      "20000/20000 [==============================] - 3s 164us/sample - loss: 0.4630 - acc: 0.7864 - val_loss: 0.5007 - val_acc: 0.7464\n",
      "Epoch 4/10\n",
      "20000/20000 [==============================] - 3s 151us/sample - loss: 0.4263 - acc: 0.8038 - val_loss: 0.4959 - val_acc: 0.7500\n",
      "Epoch 5/10\n",
      "20000/20000 [==============================] - 3s 151us/sample - loss: 0.4022 - acc: 0.8188 - val_loss: 0.4961 - val_acc: 0.7524\n",
      "Epoch 6/10\n",
      "20000/20000 [==============================] - 3s 152us/sample - loss: 0.3826 - acc: 0.8293 - val_loss: 0.4991 - val_acc: 0.7518\n",
      "Epoch 7/10\n",
      "20000/20000 [==============================] - 3s 152us/sample - loss: 0.3640 - acc: 0.8415 - val_loss: 0.5035 - val_acc: 0.7540\n",
      "Epoch 8/10\n",
      "20000/20000 [==============================] - 3s 154us/sample - loss: 0.3465 - acc: 0.8510 - val_loss: 0.5087 - val_acc: 0.7524\n",
      "Epoch 9/10\n",
      "20000/20000 [==============================] - 3s 151us/sample - loss: 0.3290 - acc: 0.8615 - val_loss: 0.5142 - val_acc: 0.7514\n",
      "Epoch 10/10\n",
      "20000/20000 [==============================] - 3s 151us/sample - loss: 0.3119 - acc: 0.8698 - val_loss: 0.5207 - val_acc: 0.7480\n"
     ]
    }
   ],
   "source": [
    "history = model.fit(x_train, y_train,\n",
    "                    epochs=10,\n",
    "                    batch_size=32,\n",
    "                    validation_split=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25000/1 - 2s - loss: 0.5466 - acc: 0.7607\n",
      "\n",
      "Test loss : 51.13045286369323 %\n",
      "Test accuracy : 76.07200145721436 %\n"
     ]
    }
   ],
   "source": [
    "evaluation = model.evaluate(x_test,  y_test,batch_size=32, verbose=2)\n",
    "print()\n",
    "print(\"Test loss :\",evaluation[0]*100,\"%\")\n",
    "print(\"Test accuracy :\",evaluation[1]*100,\"%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It’s much better to add recurrent layers or 1D convolutional layers on top of the embedded sequences to learn features that take into account each sequence as a whole.\n",
    "That’s what we’ll focus on in the next few sections. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_gpu",
   "language": "python",
   "name": "conda_gpu"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
